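# ============================================================
# Home Data for ML Course (Kaggle Learn Users)
# Lowering RMSE with a blend (average): CatBoost + LightGBM
#
# Why does averaging help?
# - CatBoost and LightGBM tend to make different kinds of errors
#   e.g. CatBoost handles categorical features well, LGBM finds sharp numeric splits, etc.
# - Averaging the two predictions often lets their individual mistakes cancel out
# - A standard move for getting close to a score of 12000 in this introductory competition
#
# How to run (Kaggle)
# 1) Run this cell as-is
# 2) submission.csv is generated
# 3) Submit it and check the Public Score
# ============================================================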

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from lightgbm import LGBMRegressor, early_stopping
from catboost import CatBoostRegressor, Pool


# ============================================================
# 1) Load
# ============================================================
train = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
test = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")

# Commonly used outlier filter: drop rows with a very large living area
# (GrLivArea > 4000) that nevertheless sold for less than 300000.
outlier_mask = (train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000)
train = train.loc[~outlier_mask]

# Split the target from the features; copies keep later edits away from `train`/`test`.
# NOTE(review): only SalePrice is removed here, so any identifier column
# (e.g. "Id") stays in X / X_test and is fed to the models — confirm this is intended.
y = train["SalePrice"].copy()
X = train.loc[:, train.columns != "SalePrice"].copy()
X_test = test.copy()


# ============================================================
# 2) Feature engineering (lightweight derived features that tend to help)
#  - Only "safe and strong" derived features that work for both CatBoost and LGBM
# ============================================================
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with derived housing features appended.

    The input frame is not modified. Area and bathroom-count columns that are
    present have their missing values replaced by 0. New columns are added in
    this order (each only when its source columns exist, except ``TotalSF``
    which is always added):

    - ``TotalSF``: TotalBsmtSF + 1stFlrSF + 2ndFlrSF (absent columns count as 0)
    - ``HouseAge``: YrSold - YearBuilt
    - ``RemodAge``: YrSold - YearRemodAdd
    - ``TotalBath``: full baths + 0.5 * half baths, including basement baths
    - ``Qual_x_GrLivArea``: OverallQual * GrLivArea
    - ``Qual_x_TotalSF``: OverallQual * TotalSF
    """
    out = df.copy()
    # Snapshot of the incoming column names; every guard below tests an input
    # column, never one of the columns created here.
    present = set(out.columns)

    # Areas and bath counts: a missing value is treated as "none", i.e. 0.
    zero_fill = (
        "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "GarageArea",
        "FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath",
    )
    for name in zero_fill:
        if name in present:
            out[name] = out[name].fillna(0)

    # Total floor area; a column that does not exist contributes a scalar 0.
    basement_sf = out.get("TotalBsmtSF", 0)
    first_sf = out.get("1stFlrSF", 0)
    second_sf = out.get("2ndFlrSF", 0)
    out["TotalSF"] = basement_sf + first_sf + second_sf

    # Age of the house / of the last remodel at the time of sale.
    if "YrSold" in present:
        if "YearBuilt" in present:
            out["HouseAge"] = out["YrSold"] - out["YearBuilt"]
        if "YearRemodAdd" in present:
            out["RemodAge"] = out["YrSold"] - out["YearRemodAdd"]

    # Bathroom total, half baths weighted by 0.5; needs all four columns.
    if {"FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath"} <= present:
        full, half = out["FullBath"], out["HalfBath"]
        bsmt_full, bsmt_half = out["BsmtFullBath"], out["BsmtHalfBath"]
        out["TotalBath"] = full + 0.5 * half + bsmt_full + 0.5 * bsmt_half

    # Quality x area interactions.
    if "OverallQual" in present:
        quality = out["OverallQual"]
        if "GrLivArea" in present:
            out["Qual_x_GrLivArea"] = quality * out["GrLivArea"]
        out["Qual_x_TotalSF"] = quality * out["TotalSF"]

    return out

# Apply the same derived features to the training and test matrices.
X, X_test = add_features(X), add_features(X_test)


# ============================================================
# 3) Target: train on log1p(SalePrice), convert back with expm1 at the end
#  - Training in log space often helps even when the score is RMSE in dollars
# ============================================================
y_log = np.log1p(y)


# ============================================================
# 4) CatBoost inputs (categorical columns)
#  - CatBoost consumes categorical columns directly (no one-hot encoding needed)
#  - Missing values in object columns are replaced with a string to be safe
# ============================================================
cat_cols = list(X.select_dtypes(include="object").columns)
# Positional indices of the categorical columns, as passed to Pool(cat_features=...)
cat_idx = X.columns.get_indexer(cat_cols).tolist()

X_cb, X_test_cb = X.copy(), X_test.copy()

# Missing categories become the literal level "Missing"
for frame in (X_cb, X_test_cb):
    frame[cat_cols] = frame[cat_cols].fillna("Missing")

# Missing numerics are filled with the median of the TRAINING column
# (CatBoost tolerates missing values; this is done for stability)
num_cols_cb = X_cb.columns.difference(cat_cols).tolist()
med = X_cb[num_cols_cb].median()
for frame in (X_cb, X_test_cb):
    frame[num_cols_cb] = frame[num_cols_cb].fillna(med)


# ============================================================
# 5) LightGBM inputs (one-hot encoding)
#  - One-hot column sets can differ from fold to fold, so the encoder is
#    fit once on train+test to fix the columns, and KFold runs afterwards
#  - Note: the imputers below are fit on train+test as well
# ============================================================
num_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()
cat_cols_lgb = X.select_dtypes(include=["object"]).columns.tolist()

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imputer", SimpleImputer(strategy="median"))]), num_cols),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            # Sparse output is the encoder's default. The explicit `sparse=`
            # keyword was renamed to `sparse_output` in scikit-learn 1.2 and
            # removed in 1.4, so relying on the default works on every version.
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols_lgb),
    ],
    # Columns that are neither int64/float64 nor object are dropped
    remainder="drop"
)

# Encode train and test together, then split back by row position
X_all = pd.concat([X, X_test], axis=0, ignore_index=True)
X_all_enc = preprocess.fit_transform(X_all)
X_lgb = X_all_enc[:len(X)]
X_test_lgb = X_all_enc[len(X):]


# ============================================================
# 6) Train CatBoost and LGBM with KFold -> average the test predictions
#  - The two models' predictions are averaged again afterwards (the blend)
#  - Both models early-stop on the validation fold, so the per-fold / OOF
#    scores printed here are somewhat optimistic
# ============================================================
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions in log space, one slot per training row
oof_cb_log = np.zeros(len(X_cb))
oof_lgb_log = np.zeros(len(X_cb))

# Fold-averaged test predictions in price space
test_cb = np.zeros(len(X_test_cb))
test_lgb = np.zeros(len(X_test_cb))

# The test pool does not depend on the fold, so it is built once
test_pool = Pool(X_test_cb, cat_features=cat_idx)

for fold, (tr_idx, va_idx) in enumerate(kf.split(X_cb), start=1):
    # ----- split -----
    X_tr_cb, X_va_cb = X_cb.iloc[tr_idx], X_cb.iloc[va_idx]
    y_tr, y_va = y_log.iloc[tr_idx], y_log.iloc[va_idx]

    # ----- CatBoost -----
    train_pool = Pool(X_tr_cb, y_tr, cat_features=cat_idx)
    valid_pool = Pool(X_va_cb, y_va, cat_features=cat_idx)

    cb = CatBoostRegressor(
        loss_function="RMSE",
        iterations=20000,        # generous cap; the overfitting detector stops earlier
        learning_rate=0.03,
        depth=8,
        l2_leaf_reg=3.0,
        random_seed=42,
        od_type="Iter",
        od_wait=300,
        verbose=0
    )
    cb.fit(train_pool, eval_set=valid_pool, use_best_model=True)

    pred_va_cb_log = cb.predict(valid_pool)
    oof_cb_log[va_idx] = pred_va_cb_log
    test_cb += np.expm1(cb.predict(test_pool)) / kf.n_splits   # back to price, then average

    # ----- LightGBM -----
    X_tr_lgb, X_va_lgb = X_lgb[tr_idx], X_lgb[va_idx]

    lgb = LGBMRegressor(
        n_estimators=20000,      # generous cap; early stopping below picks the round
        learning_rate=0.01,
        num_leaves=256,
        min_child_samples=20,
        subsample=0.8,
        subsample_freq=1,        # row subsampling is only active when this is > 0
        colsample_bytree=0.7,
        random_state=42,
        n_jobs=-1
    )
    # Stop once the validation score has not improved for 300 rounds, matching
    # the CatBoost setup above; predict() then uses the best iteration.
    lgb.fit(
        X_tr_lgb, y_tr,
        eval_set=[(X_va_lgb, y_va)],
        callbacks=[early_stopping(stopping_rounds=300, verbose=False)],
    )

    pred_va_lgb_log = lgb.predict(X_va_lgb)
    oof_lgb_log[va_idx] = pred_va_lgb_log
    test_lgb += np.expm1(lgb.predict(X_test_lgb)) / kf.n_splits

    # ----- For reference: per-fold RMSE in price space -----
    # Gives a feel for each model on its own before blending.
    # np.sqrt(MSE) is used instead of `squared=False`, which newer
    # scikit-learn releases no longer accept.
    rmse_cb  = np.sqrt(mean_squared_error(np.expm1(y_va), np.expm1(pred_va_cb_log)))
    rmse_lgb = np.sqrt(mean_squared_error(np.expm1(y_va), np.expm1(pred_va_lgb_log)))
    print(f"[Fold {fold}] RMSE(price)  CatBoost: {rmse_cb:.2f} / LGBM: {rmse_lgb:.2f}")

# ============================================================
# 7) Blend (simple average)
#  - A plain 0.5 / 0.5 average is the safest starting point
#  - Adjusting the weights afterwards (e.g. 0.6 / 0.4) based on the score
#    can sometimes help
# ============================================================
blend_pred = 0.5 * test_cb + 0.5 * test_lgb

# CV estimate of the blend (for reference).
# The submitted blend averages PRICES, so the out-of-fold blend is averaged in
# price space as well; averaging the log predictions would score a different rule.
oof_blend_price = 0.5 * np.expm1(oof_cb_log) + 0.5 * np.expm1(oof_lgb_log)
# Log-space view of the same blend, kept under its original name
oof_blend_log = np.log1p(oof_blend_price)
# np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn
# releases no longer accept
cv_rmse = np.sqrt(mean_squared_error(np.expm1(y_log), oof_blend_price))
print(f"\n[CV] RMSE(price) Blend(0.5/0.5): {cv_rmse:.2f}")

# ============================================================
# 8) Submission
# ============================================================
# Two columns: the test Ids and the blended price prediction
submission = test[["Id"]].assign(SalePrice=blend_pred)
submission.to_csv("submission.csv", index=False)
print("✅ saved: submission.csv")
